import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from openpyxl import load_workbook
import regex
from sklearn.preprocessing import StandardScaler
# Load the Excel file into a DataFrame with a multi-level column index
df = pd.read_excel('veh.xlsx', sheet_name='Sheet2')
r_veh = pd.read_excel('veh.xlsx', sheet_name='Sheet1')
r_veh
| Registered year | Vehicles | Registration_Number | |
|---|---|---|---|
| 0 | 2004 | Toyota Landcruiser 11 Seat Carrier | 1BSX935 |
| 1 | 2005 | Toyota Hilux Ute | 1DHO356 |
| 2 | 2007 | Subaru Outback Wgn Man 2.5l (air bag safe) | 1COS860 |
| 3 | 2011 | Toyota Hilux 4 x 4 Ute Auto Turbo Diesel | 1DPQ704 |
| 4 | 2008 | Coromal PS421 Camper Van | 1TIZ941 |
| ... | ... | ... | ... |
| 62 | 2016 | Kia Cerato YD S 2.0L 6spd Auto Sdn | 1GEZ923 |
| 63 | 2017 | Toyota Rav 4 ZSA42R 2.0 GX CVT Auto | 1GJE113 |
| 64 | 2017 | Toyota Rav 4 ZSA42R 2.0 GX CVT Auto | 1GJE115 |
| 65 | 2019 | Toyota Hiace TRH201R 2.7long auto. rear cargo ... | 1GSL829 |
| 66 | 2019 | Renault Kangoo 1.2EDC Auto petrol Front car van | 1GST515 |
67 rows × 3 columns
# Convert 'Registered year' to integer
r_veh['Registered year'] = pd.to_numeric(r_veh['Registered year'], errors='coerce')
# Drop rows where 'Registered year' could not be converted to an integer
r_veh = r_veh.dropna(subset=['Registered year'])
# Convert 'Registered year' to integer type explicitly
r_veh['Registered year'] = r_veh['Registered year'].astype(int)
# Retry the analysis
vehicle_counts = r_veh['Registered year'].value_counts().sort_index()
# Plotting the distribution of vehicles by year
plt.figure(figsize=(10, 6), facecolor='white')
plt.bar(vehicle_counts.index, vehicle_counts.values, color='blue')
plt.title('Distribution of Vehicles by Registered Year')
plt.xlabel('Year')
plt.ylabel('Number of Vehicles')
plt.grid(True)
plt.show()
# Output the vehicle counts for further analysis
print(vehicle_counts)
1992 1 1997 1 1998 1 2001 2 2004 1 2005 1 2007 2 2008 2 2010 1 2011 2 2012 3 2014 3 2015 1 2016 3 2017 4 2018 14 2019 18 2020 2 2021 3 2022 2 Name: Registered year, dtype: int64
The bar chart shows the number of vehicles registered each year. There's a noticeable increase in vehicle registrations in recent years, particularly in 2018 and 2019.
df
| Vehicle Description | RegistrationNumber | Jan-2022_$ | Jan-2022_Ltr | Feb-2022_$ | Feb-2022_Ltr | Mar-2022_$ | Mar-2022_Ltr | Apr-2022_$ | Apr-2022_Ltr | ... | Aug-2022_$ | Aug-2022_Ltr | Sep-2022_$ | Sep-2022_Ltr | Oct-2022_$ | Oct-2022_Ltr | Nov-2022_$ | Nov-2022_Ltr | Dec-2022_$ | Dec-2022_Ltr | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Toyota Hilux Turbo Diesel (Manual) | 1GCM620 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 1 | Ford Ranger 2017 Double PU XLT | 1GJE617 | 455.13 | 284.10 | 522.03 | 297.41 | 900.68 | 438.85 | 445.15 | 229.10 | ... | 1405.36 | 729.12 | 2381.88 | 1190.62 | 877.84 | 391.60 | 1092.28 | 509.44 | 714.11 | 340.96 |
| 2 | Toyota Hilux 2005 VZN eng 5VZ1855423 | 1DHO356 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 154.23 | 104.09 | 200.73 | 109.21 | 195.62 | 102.17 | 120.40 | 61.52 |
| 3 | Toyota HiLux 4X4 2.8L Turbo Diesel Dual Cab | 1GCD776 | 0.00 | 0.00 | 0.00 | 0.00 | 523.33 | 245.95 | 58.88 | 25.96 | ... | 155.64 | 73.69 | 83.08 | 38.09 | 514.65 | 229.31 | 0.00 | 0.00 | 0.00 | 0.00 |
| 4 | Toyota Landcruiser diesel | 1BSX935 | 1247.85 | 722.34 | 403.06 | 231.46 | 494.96 | 244.35 | 577.97 | 283.20 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 1781.67 | 701.91 | 806.85 | 309.81 | 0.00 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 135 | VW Combo Maxi Panel Van | 1DNW981 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 136 | Ford Everest Trend 2.0 DTT RWD | 1GZB575 | 148.92 | 85.54 | 65.77 | 38.53 | 109.50 | 48.03 | 90.19 | 44.94 | ... | 153.62 | 76.75 | 179.17 | 87.13 | 0.00 | 0.00 | 148.81 | 64.45 | 117.50 | 58.52 |
| 137 | Toyota Hilux 4x4 2.8L DSL D/C/C 6AT ST | 1HBB159 | 580.04 | 352.91 | 1266.80 | 721.87 | 695.80 | 344.29 | 562.29 | 284.82 | ... | 918.01 | 493.72 | 1471.86 | 747.11 | 1567.71 | 711.16 | 564.99 | 244.03 | 476.98 | 239.10 |
| 138 | Ford Courier Ute | 1BAC468 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 139 | Jerry Cans - Vet Farm | Vet Farm Jerry Can | 596.32 | 362.57 | 127.86 | 72.57 | 359.65 | 197.25 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
140 rows × 26 columns
df.columns
Index(['Vehicle Description', 'RegistrationNumber', 'Jan-2022_$',
'Jan-2022_Ltr', 'Feb-2022_$', 'Feb-2022_Ltr', 'Mar-2022_$',
'Mar-2022_Ltr', 'Apr-2022_$', 'Apr-2022_Ltr', 'May-2022_$',
'May-2022_Ltr', 'Jun-2022_$', 'Jun-2022_Ltr', 'Jul-2022_$',
'Jul-2022_Ltr', 'Aug-2022_$', 'Aug-2022_Ltr', 'Sep-2022_$',
'Sep-2022_Ltr', 'Oct-2022_$', 'Oct-2022_Ltr', 'Nov-2022_$',
'Nov-2022_Ltr', 'Dec-2022_$', 'Dec-2022_Ltr'],
dtype='object')
# Creating a summary of total liters and total cost for each vehicle
df['Total Liters'] = df.filter(like='_Ltr').sum(axis=1)
df['Total Cost'] = df.filter(like='_$').sum(axis=1)
# Displaying the summary along with the vehicle description
summary = df[['Vehicle Description', 'RegistrationNumber', 'Total Liters', 'Total Cost']]
summary.describe(include = 'all')
| Vehicle Description | RegistrationNumber | Total Liters | Total Cost | |
|---|---|---|---|---|
| count | 137 | 106 | 140.000000 | 140.000000 |
| unique | 119 | 96 | NaN | NaN |
| top | Cushman Utility Vehicle | C1417 | NaN | NaN |
| freq | 3 | 3 | NaN | NaN |
| mean | NaN | NaN | 655.144000 | 1336.214214 |
| std | NaN | NaN | 1248.001364 | 2550.852446 |
| min | NaN | NaN | 0.000000 | 0.000000 |
| 25% | NaN | NaN | 0.000000 | 0.000000 |
| 50% | NaN | NaN | 0.000000 | 0.000000 |
| 75% | NaN | NaN | 854.505000 | 1711.545000 |
| max | NaN | NaN | 6912.610000 | 14075.220000 |
summary.head()
| Vehicle Description | RegistrationNumber | Total Liters | Total Cost | |
|---|---|---|---|---|
| 0 | Toyota Hilux Turbo Diesel (Manual) | 1GCM620 | 0.00 | 0.00 |
| 1 | Ford Ranger 2017 Double PU XLT | 1GJE617 | 6912.61 | 14075.22 |
| 2 | Toyota Hilux 2005 VZN eng 5VZ1855423 | 1DHO356 | 741.33 | 1420.29 |
| 3 | Toyota HiLux 4X4 2.8L Turbo Diesel Dual Cab | 1GCD776 | 1631.48 | 3606.87 |
| 4 | Toyota Landcruiser diesel | 1BSX935 | 3635.80 | 8013.80 |
This table shows the total fuel consumption in liters and the corresponding total cost for each vehicle listed. For instance, the Ford Ranger 2017 Double PU XLT consumed 6912.61 liters of fuel, costing a total of $14,075.22 throughout the year.
# Extracting monthly data for cost and liters
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthly_costs = df[[f'{month}-2022_$' for month in months]]
monthly_liters = df[[f'{month}-2022_Ltr' for month in months]]
# Remove rows with any missing values
df_cleaned = df.dropna()
df_cleaned["RegistrationNumber"].value_counts()
1GAT667 2
C1417 2
1GDY743 2
1BFT332 2
1AXW429 2
..
1GGG295 1
1GTI631 1
1GVB723 1
1GUH905 1
Vet Farm Jerry Can 1
Name: RegistrationNumber, Length: 94, dtype: int64
# Group by Vehicle Description and get monthwise litre consumption for each
grouped = df_cleaned.groupby('Vehicle Description')[[f'{month}-2022_Ltr' for month in months]]
# Plot monthwise litre consumption for each Vehicle Description separately
plt.figure(figsize=(12, 6))
for vehicle_desc, data in grouped:
plt.plot(months, data.values[0], label=vehicle_desc)
plt.xlabel('Month')
plt.ylabel('Litres')
plt.title('Monthwise Litre Consumption for ' + vehicle_desc)
plt.legend()
plt.show()
# Group by Vehicle Description and get monthwise cost spent for each
grouped2 = df_cleaned.groupby('Vehicle Description')[[f'{month}-2022_$' for month in months]]
# Plot monthwise litre consumption for each Vehicle Description separately
plt.figure(figsize=(12, 6))
for vehicle_desc, data in grouped2:
plt.plot(months, data.values[0], label=vehicle_desc)
plt.xlabel('Month')
plt.ylabel('Cost ($)')
plt.title('Monthwise Monthwise Cost Spent per ' + vehicle_desc)
plt.legend()
plt.show()
pd.set_option('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None)
# Remove rows with any missing values
df_cleaned = df.dropna()
# Descriptive Statistics
descriptive_stats = df_cleaned.describe()
# Time Series Analysis
monthly_trends = df_cleaned.set_index('Vehicle Description').T
# Distribution Analysis
fig, ax = plt.subplots(2, 1, figsize=(12, 10))
sns.histplot(df_cleaned.filter(like='_Ltr'), kde=True, ax=ax[0])
sns.boxplot(data=df_cleaned.filter(like='_Ltr'), ax=ax[1])
plt.tight_layout()
# Correlation Heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(df_cleaned.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()
# PCA Analysis
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_cleaned.filter(like='_Ltr'))
pca = PCA(n_components=2)
pca_results = pca.fit_transform(scaled_data)
# K-Means Clustering
kmeans = KMeans(n_clusters=3, random_state=0).fit(scaled_data)
# Outputting results
print('Descriptive Statistics:\
', descriptive_stats)
print('\
PCA Results:\
', pca_results)
print('\
K-Means Clustering Labels:\
', kmeans.labels_)
C:\Users\anand\AppData\Local\Temp\ipykernel_6984\1486090672.py:19: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. sns.heatmap(df_cleaned.corr(), annot=True, cmap='coolwarm')
C:\Users\anand\anaconda3\envs\aiml\lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn( C:\Users\anand\anaconda3\envs\aiml\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn(
Descriptive Statistics: Jan-2022_$ Jan-2022_Ltr Feb-2022_$ Feb-2022_Ltr Mar-2022_$ \
count 101.000000 101.000000 101.000000 101.000000 101.000000
mean 110.381287 65.618317 134.372475 75.518020 184.040099
std 223.084298 132.756628 259.902012 146.395124 312.681825
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000 0.000000
75% 90.540000 55.310000 186.500000 101.340000 299.030000
max 1247.850000 722.340000 1374.130000 759.200000 1479.030000
Mar-2022_Ltr Apr-2022_$ Apr-2022_Ltr May-2022_$ May-2022_Ltr \
count 101.000000 101.000000 101.000000 101.000000 101.000000
mean 88.914059 95.869109 49.010099 178.518614 88.505842
std 150.623715 172.822913 87.219650 398.447006 196.550328
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000 0.000000
75% 132.220000 117.490000 61.610000 210.150000 108.740000
max 724.590000 727.640000 350.350000 2699.880000 1320.220000
Jun-2022_$ Jun-2022_Ltr Jul-2022_$ Jul-2022_Ltr Aug-2022_$ \
count 101.000000 101.000000 101.000000 101.000000 101.000000
mean 151.719604 69.664554 164.124059 72.510693 148.776733
std 337.586910 154.104850 313.702274 134.703290 288.718907
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000 0.000000
75% 185.700000 84.260000 236.870000 106.130000 211.610000
max 2228.510000 1012.620000 1772.280000 710.570000 1631.220000
Aug-2022_Ltr Sep-2022_$ Sep-2022_Ltr Oct-2022_$ Oct-2022_Ltr \
count 101.000000 101.000000 101.000000 101.000000 101.000000
mean 76.623762 155.437030 79.129901 154.687426 70.510594
std 148.655450 355.670467 178.268329 308.754844 135.105605
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000 0.000000
75% 108.380000 154.230000 82.780000 188.390000 96.850000
max 825.140000 2381.880000 1190.620000 1781.670000 711.160000
Nov-2022_$ Nov-2022_Ltr Dec-2022_$ Dec-2022_Ltr Total Liters \
count 101.000000 101.000000 101.000000 101.000000 101.000000
mean 185.331188 82.727030 141.105347 68.309307 887.042178
std 432.708106 188.828932 367.199506 174.908981 1396.725392
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000 0.000000
75% 206.860000 117.910000 117.120000 56.980000 1479.540000
max 3527.300000 1530.260000 2906.510000 1364.270000 6912.610000
Total Cost
count 101.000000
mean 1804.362970
std 2854.275402
min 0.000000
25% 0.000000
50% 0.000000
75% 2893.850000
max 14075.220000
PCA Results: [[-1.71949428e+00 -9.42266639e-02]
[ 1.11505310e+01 -1.67935163e+00]
[-4.48290733e-01 -1.90100096e-01]
[ 1.55010477e+00 -1.97708464e+00]
[ 6.11057000e+00 -7.36972400e-01]
[ 1.03246473e+00 -5.55218722e-01]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[ 2.31348595e+00 1.06317443e+00]
[ 2.62950370e+00 4.45614553e+00]
[-1.71949428e+00 -9.42266639e-02]
[-1.24054641e+00 -2.41185755e-01]
[-1.71949428e+00 -9.42266639e-02]
[ 7.27509256e-01 1.94993354e+00]
[-7.00790673e-01 5.70417863e-02]
[-1.71949428e+00 -9.42266639e-02]
[ 3.69784784e-01 -5.56936064e-01]
[-1.51585127e+00 -1.25796572e-01]
[-1.71949428e+00 -9.42266639e-02]
[-1.20952527e+00 3.30857331e-01]
[ 5.88067716e+00 -1.06054740e+00]
[ 2.69056583e+00 -5.97684433e-01]
[-5.39387100e-01 2.12037025e+00]
[ 6.91396297e+00 2.34583990e+00]
[ 2.71989916e+00 5.84767553e-01]
[-1.71949428e+00 -9.42266639e-02]
[ 4.38515136e+00 3.81978225e-01]
[ 2.00398032e+00 2.66339995e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[ 4.17112970e+00 3.21456842e-01]
[ 8.91617901e-01 -4.31213761e-01]
[-1.71949428e+00 -9.42266639e-02]
[ 3.41547733e-01 1.37055137e+00]
[ 3.21332633e-01 -5.82372655e-01]
[ 1.62429460e+00 -3.90110589e-01]
[-1.71949428e+00 -9.42266639e-02]
[ 3.88420393e+00 -3.34071038e-01]
[ 3.91038372e-01 -1.99710254e-01]
[ 7.36125783e+00 1.91884768e+00]
[-1.71949428e+00 -9.42266639e-02]
[-4.83099367e-01 -2.23957634e-01]
[-1.71949428e+00 -9.42266639e-02]
[-7.83422062e-01 -3.13328544e-01]
[ 3.58428919e+00 -2.84883873e+00]
[ 2.07112590e-01 -3.25837416e-01]
[ 2.30620151e-01 -7.81906826e-02]
[ 1.12302365e+00 -3.92060540e-01]
[ 2.34349377e+00 -4.18525435e-02]
[ 5.31388548e+00 -4.24236713e-01]
[ 2.05229867e+00 7.27448930e+00]
[ 7.61019641e-02 -1.27729289e-01]
[-1.71949428e+00 -9.42266639e-02]
[ 4.01239054e+00 -1.90553824e+00]
[ 1.52983398e+00 3.89475295e-01]
[-9.32737557e-01 -1.31379964e-01]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-6.93353750e-01 -7.43395011e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[ 4.56913740e+00 -2.15633083e+00]
[-1.71949428e+00 -9.42266639e-02]
[-1.71949428e+00 -9.42266639e-02]
[-1.62405851e+00 -1.55527077e-01]
[-3.16676799e-01 -9.26395972e-03]
[ 9.20314721e+00 -7.57981488e-01]
[-1.71949428e+00 -9.42266639e-02]
[-3.69517804e-01 1.21425983e-01]]
K-Means Clustering Labels: [1 2 1 0 2 0 1 1 0 0 1 1 1 0 1 1 0 1 1 1 2 0 0 2 0 1 2 0 1 1 2 0 1 0 0 0 1
2 0 2 1 1 1 1 0 0 0 0 0 2 0 1 1 2 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1]
The correlation values range from moderate to high, indicating a significant relationship between the costs and liters consumed across different months. For instance, the correlation between January 2022 costs and liters is approximately 0.72, suggesting a strong positive relationship. This correlation analysis helps in understanding how closely the costs are related to the fuel consumption across different months.
Descriptive Statistics: Provided a statistical summary of the data.
Time Series Analysis: Analyzed monthly trends.
Distribution Analysis: Visualized the distribution of liters consumed per month.
Correlation Heatmap: Generated a heatmap to visualize correlations between variables.
PCA Analysis: Performed Principal Component Analysis to reduce dimensionality.